CustomExtractFeatures
对输入字符串序列逐条进行特征提取,生成对应的离散标签(label)和权重(weight)。
该算子主要用于文本特征工程,包含以下逻辑:
对黑名单字符串进行过滤
对非黑名单字符串计算哈希特征
根据字符串中空格数量生成权重
\[\begin{split}\text{label}_i =
\begin{cases}
0, & \text{if } s_i \in \text{blacklist} \\
\operatorname{hash}(s_i) \bmod K, & \text{otherwise}
\end{cases}\end{split}\]
\[\begin{split}\text{weight}_i =
\begin{cases}
0, & \text{if } s_i \in \text{blacklist} \\
\text{space\_count}(s_i) + 1, & \text{otherwise}
\end{cases}\end{split}\]
其中:
\(s_i\) 表示第 \(i\) 条输入字符串
\(K\) 为固定的哈希空间大小(默认 \(10^6\))
blacklist = {"<S>", "<E>", "<S> <E>"}
- 输入:
string_pointers - 指向字符串首地址的指针数组。
string_lengths - 各字符串对应的长度数组。
num_strings - 输入字符串的数量。
core_mask - 核掩码。
- 输出:
output_labels - 输出标签数组地址(int32)。
output_weights - 输出权重数组地址。
- 支持平台:
FT78NE, MT7004
备注
- FT78NE 支持的数据类型:
fp32, fp64
int8, int16, int32
cplx64, cplx128
- MT7004 支持的数据类型:
fp16, fp32
int16, int32
cplx64
当 num_strings == 0 时,输出的 label 和 weight 被置为 0。
黑名单字符串不会参与哈希计算。
共享存储版本:
-
void fp_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, float *output_weights, int core_mask)
-
void dp_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, double *output_weights, int core_mask)
-
void i8_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int8_t *output_weights, int core_mask)
-
void i16_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int16_t *output_weights, int core_mask)
-
void i32_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int32_t *output_weights, int core_mask)
-
void c64_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, float *output_weights, int core_mask)
-
void c128_extract_features_s(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, double *output_weights, int core_mask)
C调用示例:
// FT78NE 多核示例
#include <stdio.h>
#include <custom_extract_features.h>

int main(int argc, char* argv[]) {
    char* strings[] = {"hello world", "<S>", "test data"};
    int lengths[] = {11, 3, 9};
    int num_strings = 3;

    int *labels = (int *)0xA0000000;
    float *weights = (float *)0xB0000000;

    int core_mask = 0xff;

    fp_extract_features_s(strings, lengths, num_strings, labels, weights, core_mask);
    return 0;
}
私有存储版本:
-
void fp_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, float *output_weights)
-
void dp_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, double *output_weights)
-
void i8_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int8_t *output_weights)
-
void i16_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int16_t *output_weights)
-
void i32_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, int32_t *output_weights)
-
void c64_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, float *output_weights)
-
void c128_extract_features_p(char **string_pointers, int *string_lengths, int num_strings, int *output_labels, double *output_weights)
C调用示例:
// MT7004 单核示例
#include <stdio.h>
#include <custom_extract_features.h>

int main(int argc, char* argv[]) {
    char* strings[] = {"example text"};
    int lengths[] = {12};
    int num_strings = 1;

    int *labels = (int *)0x10000000;
    float *weights = (float *)0x11000000;

    fp_extract_features_p(strings, lengths, num_strings, labels, weights);
    return 0;
}